/**
 * Copyright (C) 2011 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
 * file except in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
package com.cloudhopper.commons.charset;

import java.text.Normalizer;
import java.util.regex.Pattern;

/**
 * Utility class for working with text used on mobile phones (primarily SMS).
 * Helpful methods for converting unicode characters into their ascii equivalents
 * such as smart quotes to dumb quotes.
 *
 * @author jlauer
 */
public class MobileTextUtil {

    /**
     * Replacement table; each row is { source-char, replace-char }.
     * See http://en.wikipedia.org/wiki/Quotation_mark_glyphs
     */
    public static final char[][] CHAR_TABLE = {
        { '\u2013', '-' },   // en dash
        { '\u2014', '-' },   // em dash
        { '\u2018', '\'' },  // left single quotation mark
        { '\u2019', '\'' },  // right single quotation mark
        { '\u201A', '\'' },  // single low-9 quotation mark
        { '\u201C', '"' },   // left double quotation mark
        { '\u201D', '"' },   // right double quotation mark
        { '\u201E', '"' },   // double low-9 quotation mark
        { '\u2020', '+' },   // dagger
        { '\u2022', '.' },   // bullet
        { '\u2026', '.' },   // horizontal ellipsis (actually "...")
        { '\u2039', '<' },   // single left-pointing angle quotation mark
        { '\u203A', '>' }    // single right-pointing angle quotation mark
    };

    /**
     * Matches runs of characters in the Combining Diacritical Marks block
     * (U+0300 - U+036F). Compiled once instead of on every call.
     */
    private static final Pattern COMBINING_DIACRITICAL_MARKS =
            Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /**
     * Replace unicode characters with their ascii equivalents, limiting
     * replacement to "safe" characters such as smart quotes to dumb quotes.
     * "Safe" is subjective, but generally the agreement is that these character
     * replacements should not change the meaning of the string in any meaningful
     * way. The replacement is done in place and never changes the buffer length.
     *
     * @param buffer The buffer containing the characters to analyze and replace
     *      if necessary. Must not be null.
     * @return The number of characters replaced
     */
    public static int replaceSafeUnicodeChars(StringBuilder buffer) {
        int replaced = 0;
        for (int i = 0; i < buffer.length(); i++) {
            char c = buffer.charAt(i);
            for (int j = 0; j < CHAR_TABLE.length; j++) {
                if (c == CHAR_TABLE[j][0]) {
                    buffer.setCharAt(i, CHAR_TABLE[j][1]);
                    replaced++;
                    // a character has at most one replacement; stop scanning the table
                    break;
                }
            }
        }
        return replaced;
    }

    /**
     * Replace accented characters with their ascii equivalents. For example,
     * convert e-acute (U+00E9) to e. Accents are removed whether the input holds
     * them precomposed (a single accented character) or already decomposed
     * (a base character followed by combining marks).<br><br>
     * Only marks from the Combining Diacritical Marks block (U+0300 - U+036F)
     * are removed. If no such mark is present the buffer is left untouched.
     * Otherwise the buffer is rewritten with the stripped text in NFC form, so
     * characters that merely have a canonical decomposition (for example Hangul
     * syllables) are recomposed rather than left decomposed.<br><br>
     * NOTE: This method is not very efficient. The String will be copied
     * several times during conversion, so you'll likely only want to run this
     * against small strings.
     *
     * @param buffer The buffer containing the characters to analyze and replace
     *      if necessary. Must not be null.
     * @return The number of combining marks removed; this equals the number of
     *      characters replaced when each accented character carries one accent
     */
    public static int replaceAccentedChars(StringBuilder buffer) {
        // each accented char is converted into the ascii version followed by
        // its combining accent character(s)
        String decomposed = Normalizer.normalize(buffer, Normalizer.Form.NFD);

        // replace the accents with nothing
        String stripped = COMBINING_DIACRITICAL_MARKS.matcher(decomposed).replaceAll("");

        // count what was really removed; a length change caused by NFD alone
        // (e.g. a decomposed Hangul syllable) is not an accent replacement
        int replaced = decomposed.length() - stripped.length();
        if (replaced == 0) {
            return 0;
        }

        // recompose so that unrelated decompositions introduced by NFD are undone
        buffer.setLength(0);
        buffer.append(Normalizer.normalize(stripped, Normalizer.Form.NFC));
        return replaced;
    }
}